%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
from random import random
from datetime import datetime
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing
from sklearn.decomposition import PCA
# Load the raw AirBnB datasets (calendar, listings, reviews) for each city
# from CSV files in the working directory.
# Boston
Boston_Calendar_df = pd.read_csv('Boston_calendar.csv')
Boston_listings_df = pd.read_csv('Boston_listings.csv')
Boston_reviews_df = pd.read_csv('Boston_reviews.csv')
# Seattle
Seattle_Calendar_df = pd.read_csv('Seattle_calendar.csv')
Seattle_listings_df = pd.read_csv('Seattle_listings.csv')
Seattle_reviews_df = pd.read_csv('Seattle_reviews.csv')
For the calendar dataframe, all the columns are interesting.
# Quick structural overview: calendar shape/columns, then one sample value
# per listings column to judge which columns are worth keeping.
print(Boston_Calendar_df.shape)
Boston_Calendar_df.columns
print(Boston_listings_df.shape)
for i in Boston_listings_df.columns:
    # Print the column name followed by a sample value (row 1).
    print(i)
    print(Boston_listings_df[i].iloc[1])
    print('\n')
I will drop the columns that I will not look into and make a list of the columns that I will. I would like to keep whatever might be found useful for my endeavour. There are some that might be interesting which we have not included because the average was very skewed; for example, in the binary variable host_has_profile_pic, all hosts had a pic except for 8.
# Columns kept for the analysis; the same subset is applied to both cities
# so the two listings dataframes stay directly comparable.
column_list_listing = ['host_response_time', 'host_location', 'host_response_rate', 'host_acceptance_rate',\
'host_is_superhost', 'neighbourhood', 'city', 'zipcode', 'latitude', 'longitude','is_location_exact', \
'property_type', 'room_type', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'bed_type',\
'price', 'security_deposit', 'cleaning_fee', 'square_feet', 'number_of_reviews', 'review_scores_rating',\
'review_scores_cleanliness', 'review_scores_accuracy', 'review_scores_checkin', 'review_scores_communication', \
'review_scores_location', 'review_scores_value']
Boston_listings_df = Boston_listings_df[column_list_listing]
Seattle_listings_df = Seattle_listings_df[column_list_listing]
Looking at the columns of Boston_reviews, our analysis will not need this dataset.
# The reviews dataset holds free-text comments only, so it is not used further.
print(Boston_reviews_df.shape)
Boston_reviews_df.columns
# Check dtypes and null counts of the calendar data before cleaning.
Boston_Calendar_df.info()
I am interested in price, and I see that there are missing values. I will delete them. I will not impute them, as I am interested in a trend, and fake imputed values would not add insights. We will not study the reasons behind NaN values; it is out of the scope of the question. I will apply the same technique to Seattle.
# We delete rows with nan price values.
# Boston
Boston_Calendar_df = Boston_Calendar_df.dropna(subset=['price'], how='any')
# Seattle
Seattle_Calendar_df = Seattle_Calendar_df.dropna(subset=['price'], how='any')
# We look at the types and what the values look like: date and price are
# plain strings at this point and will need conversion below.
print(type(Boston_Calendar_df['date'].iloc[0]), type(Boston_Calendar_df['available'].iloc[0]), type(Boston_Calendar_df['price'].iloc[0]))
print(Boston_Calendar_df['date'].iloc[0], Boston_Calendar_df['available'].iloc[0], Boston_Calendar_df['price'].iloc[0])
So we need to convert:
# 1. price: from str to float
# Strip the currency symbol and the thousands separators ("," marks thousands)
# with vectorized .str.replace, then cast to float. This replaces the original
# per-row Python lambdas — same result, one C-level pass per operation.
# Boston
Boston_Calendar_df['price'] = (
    Boston_Calendar_df['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
# Seattle
Seattle_Calendar_df['price'] = (
    Seattle_Calendar_df['price']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)
# 2. date: from str to datetime.date
# pd.to_datetime parses the entire column at C speed; .dt.date yields the same
# datetime.date objects the original per-row strptime produced.
# REF: https://stackabuse.com/converting-strings-to-datetime-in-python/
# Boston
Boston_Calendar_df['date'] = pd.to_datetime(Boston_Calendar_df['date'], format='%Y-%m-%d').dt.date
# Seattle
Seattle_Calendar_df['date'] = pd.to_datetime(Seattle_Calendar_df['date'], format='%Y-%m-%d').dt.date
# 3. available: from str to bool (for convenience)
# However, there is only one value for 'available' after dropping NaN prices,
# so we do not keep this column. Likewise for Seattle, to have equal datasets.
Boston_Calendar_df['available'].value_counts()
# We take the columns that we need
column_list_calendar = ['listing_id', 'date', 'price']
# Boston
Boston_Calendar_df = Boston_Calendar_df[column_list_calendar]
# Seattle
Seattle_Calendar_df = Seattle_Calendar_df[column_list_calendar]
# Let us check that our conversions have worked (date -> datetime.date, price -> float)
print(type(Boston_Calendar_df['date'].iloc[0]), type(Boston_Calendar_df['price'].iloc[0]))
print(Boston_Calendar_df['date'].iloc[0], Boston_Calendar_df['price'].iloc[0])
# Plot the daily mean price for both cities on one shared axis for comparison.
Boston_Calendar_df.groupby('date').price.mean().plot(x='date', y='price', color='red', title='AirB&B prices overtime', legend=True)
Seattle_Calendar_df.groupby('date').price.mean().plot(x='date', y='price', color='blue', legend=True)
plt.xticks(rotation=45)
plt.legend(['Price Boston', 'Price Seattle'])
The answer below indicates a 45% greater average over time.
# Ratio of the two cities' overall mean daily price (Boston / Seattle).
Boston_Calendar_df.groupby('date').price.mean().mean() / Seattle_Calendar_df.groupby('date').price.mean().mean()
The prices in Boston have had a larger delta than in Seattle. Check the data below.
# Range (max - min) of the daily mean price for each city, and their ratio.
Boston_delta = Boston_Calendar_df.groupby('date').price.mean().max() - Boston_Calendar_df.groupby('date').price.mean().min()
Seattle_delta = Seattle_Calendar_df.groupby('date').price.mean().max() - Seattle_Calendar_df.groupby('date').price.mean().min()
print('Boston delta', Boston_delta)
print('Seattle delta', Seattle_delta)
print('Delta ratio Boston/Seattle', Boston_delta/Seattle_delta)
Something to note before writing the conclusions is that the data from Boston and Seattle do not overlap in dates 100%; they overlap only at the end of 2016 and beginning of 2017. The first and obvious conclusion is that the prices of AirB&Bs in Boston (red) are much greater over time on average, by 45%. The second conclusion is that the prices for Boston fluctuate more than in Seattle; the deltas are more erratic. The third conclusion is that the prices in Boston have a downward trend, while in Seattle they have an upward trend. And the fourth conclusion is that the maximum delta of prices in Boston is 176% higher than in Seattle. The downward trend in Boston at the beginning might be due to a hype of AirB&B when it was first released in Boston. Then market forces reduced the average prices of the households, and around halfway along its trendline, the prices start to pick up again in a mild upward trend, like the one Seattle had in the beginning. Another interesting feature of the Boston curve to check is the spike that sprouts around April 2017. Doing some googling, I found the event that might have caused the spike, a massive marathon (26k runners): https://en.wikipedia.org/wiki/2017_Boston_Marathon
# Null counts and dtypes of the trimmed listings data for both cities.
Seattle_listings_df.info()
Boston_listings_df.info()
We are interested in the price, thus, we can drop any row that does not contain a price value
# We delete rows with nan price values.
# Boston
Boston_listings_df = Boston_listings_df.dropna(subset=['price'], how='any')
# Seattle
Seattle_listings_df = Seattle_listings_df.dropna(subset=['price'], how='any')
# In case there are any:
#Drop columns with all NaN values
Boston_listings_df = Boston_listings_df.dropna(how='all', axis=1)
#Drop columns with all NaN values
Seattle_listings_df = Seattle_listings_df.dropna(how='all', axis=1)
# Preview each remaining column's type and a sample value (row 45).
for attribute in Boston_listings_df.columns:
    print(attribute, type(Boston_listings_df[attribute].iloc[0]))
    print(Boston_listings_df[attribute].iloc[45])
Some are already in data types I can work with, others however must be converted:
To categories
To float
# Function to create dummy variables
def create_dummy_df(df, cat_cols):
    """One-hot encode each column of `cat_cols` in `df`, dropping the original.

    Columns not present in `df` are skipped (mirrors the original best-effort
    behaviour, but without the bare `except` that also hid real errors such as
    MemoryError). drop_first=True avoids the dummy-variable trap and
    dummy_na=False ignores NaN categories.

    Returns the transformed dataframe (the input is not mutated).
    """
    for col in cat_cols:
        if col not in df.columns:
            # Column already removed or never existed — skip silently.
            continue
        dummies = pd.get_dummies(df[col], prefix=col, prefix_sep='_',
                                 drop_first=True, dummy_na=False)
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    return df
# We select the columns that will be converted into dummy variables
cat_cols = ['host_response_time', 'host_location', 'bed_type', 'room_type', 'property_type', 'city', 'zipcode', 'neighbourhood', 'host_is_superhost', 'is_location_exact']
Boston_listings_df = create_dummy_df(Boston_listings_df, cat_cols)
Seattle_listings_df = create_dummy_df(Seattle_listings_df, cat_cols)
# We select the columns that will be converted into floats
# (they contain '$', ',' or '%' characters that must be stripped first)
columns_to_float = ['cleaning_fee', 'security_deposit', 'price', 'host_acceptance_rate', 'host_response_rate']
# Boston
# NaN entries are already floats, so calling str.replace on them would raise
# REF: https://stackoverflow.com/questions/36000993/numpy-isnan-fails-on-an-array-of-floats-from-pandas-dataframe-apply
def convert_value_into_float(df, columns_to_float):
    """Strip '$', ',' and '%' from the given string columns and cast to float.

    NaN entries are left untouched: they are already floats, and calling
    str.replace on them would raise. Uses scalar `pd.isnull(x)` instead of the
    original `pd.isnull(np.array([x]))`, whose single-element-array truthiness
    is fragile/deprecated, and does the three replacements in one pass per
    value instead of four separate `.apply` sweeps.

    Returns the mutated dataframe.
    """
    def _to_float(x):
        if pd.isnull(x):
            return x
        return float(x.replace('$', '').replace(',', '').replace('%', ''))

    for col in columns_to_float:
        df[col] = df[col].apply(_to_float)
    return df
# Apply the string-to-float cleanup to both cities.
Boston_listings_df = convert_value_into_float(Boston_listings_df, columns_to_float)
Seattle_listings_df = convert_value_into_float(Seattle_listings_df, columns_to_float)
# Fill numeric columns with the mean where a nan value exists
def fill_nan_with_mean(df):
    """Impute NaNs in every float/int column of `df` with that column's mean.

    Object/categorical columns are left untouched. Returns the dataframe.
    """
    num_vars = df.select_dtypes(include=['float', 'int']).columns
    for col in num_vars:
        # Plain assignment instead of chained fillna(inplace=True), which is
        # deprecated chained-assignment behaviour in modern pandas.
        df[col] = df[col].fillna(df[col].mean())
    return df
# Impute remaining numeric NaNs for both cities before plotting/modelling.
Boston_listings_df = fill_nan_with_mean(Boston_listings_df)
Seattle_listings_df = fill_nan_with_mean(Seattle_listings_df)
# Boston
# Pairwise scatter plots with regression fits: price vs the main numeric
# listing attributes, on a 1000-row sample (pairplot is slow on the full data).
g = sns.pairplot(Boston_listings_df.sample(1000, random_state=42), \
vars=['price', 'host_response_rate', 'host_acceptance_rate', 'accommodates', 'bathrooms', \
'beds', 'bedrooms', 'security_deposit', 'cleaning_fee', 'square_feet', \
'number_of_reviews'],kind="reg")
We are only interested in the first row for price. We see that the response and acceptance rates are not correlated to the price, which is logical. However, the number of people it accommodates and the number of rooms (bathrooms, bedrooms, square feet) are correlated. Also the cleaning fee is correlated to price; it makes sense because the larger the place, the more it costs to clean it. Likewise with respect to the security deposit. However, there is no correlation between price and number of reviews. I would have assumed that with higher prices and better accommodations, more people would be willing to leave a review, but I was wrong.
Let us have a look into other variables related to reviews.
# Boston
# Price vs the review sub-scores, same 1000-row sampling approach as above.
g = sns.pairplot(Boston_listings_df.sample(1000, random_state=42), \
vars=['price', 'review_scores_rating', 'review_scores_cleanliness', 'review_scores_accuracy', 'review_scores_checkin', \
'review_scores_communication', 'review_scores_location', 'review_scores_value'] \
,kind="reg")
There is a very slight positive correlation between price and reviews, but nothing to note. This means that the quality/quantity of the apartment in terms of fees, accommodation, etc., which can be represented by the price (as we have seen before), is not an indicator of how happy a customer is.
Let us see if it is the same conclusion for Seattle:
# Seattle
# Same price-vs-numeric-attributes pairplot as for Boston, for comparison.
g = sns.pairplot(Seattle_listings_df.sample(1000, random_state=45), \
vars=['price', 'host_response_rate', 'host_acceptance_rate', 'accommodates', 'bathrooms', \
'beds', 'bedrooms', 'security_deposit', 'cleaning_fee', 'square_feet', \
'number_of_reviews'],kind="reg")
They have the same positive correlation as it happened in Boston.
# Seattle
# Price vs the review sub-scores for Seattle.
g = sns.pairplot(Seattle_listings_df.sample(1000, random_state=42), \
vars=['price', 'review_scores_rating', 'review_scores_cleanliness', 'review_scores_accuracy', 'review_scores_checkin', \
'review_scores_communication', 'review_scores_location', 'review_scores_value'] \
,kind="reg")
With respect to the second analysis looking into the review scores, we conclude the same as we did with Boston.
Thus we can conclude that the attributes that best explain price are the accommodation itself, the number of people that can be accommodated and the size of the apartment (square feet and number of rooms).
I will do a PCA analysis to have another perspective on the correlations. This however does not target price directly; it provides a general view of what is positively or negatively correlated. Thus, if price is correlated strongly with a variable, we will observe it in the first principal components as well.
# We perform scaling
# Standardize every column (zero mean, unit variance) before PCA, since PCA
# is sensitive to feature scale.
scaler = preprocessing.StandardScaler()
Boston_listings_df_scaled = scaler.fit_transform(Boston_listings_df)  # Fit and transform the data
# Rebuild a dataframe so the original index and column names are kept.
# (The original follow-up line `df.columns = df.columns` was a no-op and
# has been removed.)
Boston_listings_df_scaled = pd.DataFrame(Boston_listings_df_scaled, index=Boston_listings_df.index, columns=Boston_listings_df.columns)
Boston_listings_df
# We create the PCA object (all components retained by default)
pca = PCA()
Boston_listings_df_pca = pca.fit_transform(Boston_listings_df_scaled)
len(pca.components_)
# We check the sum is 1
ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
ratios.sum()
# Function to plot the explained variability
# Ref: Course
def scree_plot(pca):
    """Bar-plot each component's explained variance ratio plus the cumulative curve.

    Parameters
    ----------
    pca : a fitted sklearn PCA object (uses its explained_variance_ratio_).
    """
    num_components = len(pca.explained_variance_ratio_)
    ind = np.arange(num_components)
    vals = pca.explained_variance_ratio_
    plt.figure(figsize=(10, 6))
    ax = plt.subplot(111)
    cumvals = np.cumsum(vals)
    ax.bar(ind, vals)
    ax.plot(ind, cumvals)
    for i in range(num_components):
        # Proper rounding via a format spec instead of the original
        # str(vals[i]*100)[:4] truncation hack, which mis-renders some values.
        ax.annotate("%0.1f%%" % (vals[i] * 100), (ind[i] + 0.2, vals[i]), va="bottom", ha="center", fontsize=12)
    ax.xaxis.set_tick_params(width=0)
    ax.yaxis.set_tick_params(width=2, length=12)
    ax.set_xlabel("Principal Component")
    ax.set_ylabel("Variance Explained (%)")
    plt.title('Explained Variance Per Principal Component')
# Visualize variance explained per component for the full PCA.
scree_plot(pca)
vals = pca.explained_variance_ratio_
We can do with the fields that explain at least 85 percent of the variability. It is enough for our endeavour, and we will need fewer variables for computation later on if we need to use the principal components as inputs for our models.
# Find the first component index at which the cumulative explained variance
# reaches 85% — vectorized cumsum/searchsorted replacing the original manual
# accumulation loop; prints the same (index, cum_variance) pair.
cumulative = np.cumsum(vals)
index = int(np.searchsorted(cumulative, 0.85))
index = min(index, len(vals) - 1)  # guard in case total variance < 85%
cum_variance = cumulative[index]
# We print the index of the variable at which the cumulative variance first exceeds 85% of the explained variance
print(index, cum_variance)
# Re-apply PCA, retaining the number of components found above: index + 1
# because the loop index is 0-based while n_components is a count. Using the
# computed value instead of the original hard-coded 220 keeps this cell
# correct if the data (and hence the 85% cut-off) changes.
pca_final = PCA(index + 1)
Boston_listings_df_pca = pca_final.fit_transform(Boston_listings_df_scaled)
# We should get around 0.85
ratios = pca_final.explained_variance_ratio_.reshape(len(pca_final.components_), 1)
ratios.sum()
scree_plot(pca_final)
def pca_results(full_dataset, pca, ith_component):
    '''
    Create a DataFrame of the PCA results and return one component's row.

    Parameters
    ----------
    full_dataset : DataFrame whose columns name the original features.
    pca : fitted PCA object (components_ and explained_variance_ratio_ used).
    ith_component : 0-based index of the principal component to return.

    Returns
    -------
    Series holding the component's explained variance ratio followed by its
    feature weights, rounded to 4 decimals.
    '''
    # Dimension indexing (the original duplicated `dimensions = dimensions =`)
    dimensions = ['Dimension {}'.format(i) for i in range(1, len(pca.components_) + 1)]
    # PCA components
    components = pd.DataFrame(np.round(pca.components_, 4), columns=full_dataset.keys())
    components.index = dimensions
    # PCA explained variance
    ratios = pca.explained_variance_ratio_.reshape(len(pca.components_), 1)
    variance_ratios = pd.DataFrame(np.round(ratios, 4), columns=['Explained Variance'])
    variance_ratios.index = dimensions
    # Return the requested row of the concatenated DataFrame
    return pd.concat([variance_ratios, components], axis=1).iloc[ith_component]
From Udacity: "As a reminder, each principal component is a unit vector that points in the direction of highest variance (after accounting for the variance captured by earlier principal components). The further a weight is from zero, the more the principal component is in the direction of the corresponding feature. If two features have large weights of the same sign (both positive or both negative), then increases in one tend expect to be associated with increases in the other. To contrast, features with different signs can be expected to show a negative correlation: increases in one variable should result in a decrease in the other."
I am going to plot the highest and lowest weights of the first 3 principal components (the ones that explain more variability out of the 219). Each PC will tell us what is positively and negatively correlated within them.
# 1st principal component: plot the 4 most negative and 4 most positive
# feature weights.
first_PC = pca_results(Boston_listings_df_scaled, pca_final, 0)
first_PC_top_weights = pd.concat([first_PC.sort_values().head(4), first_PC.sort_values().tail(4)])
first_PC_top_weights.plot.bar(grid=True)
plt.show()
It is logical to think that latitude and host location are correlated. That the cleaning fee increases with latitude and host location might be due to the fact that in regions with higher latitude the salaries for workers are higher. And it is also interesting that, apparently, the response time is usually a few hours when the latitude is higher, but we can logically conclude that it is a mere coincidence; most probably, with a larger dataset this would not be positively correlated.
It is straightforward to conclude that, given a low review of one aspect, the probability that the customer is unhappy with other aspects is also high.
These 2 sets are inversely correlated.
# 2nd principal component (note: the `first_PC` name is reused from the
# previous cell — it holds the 2nd component here).
first_PC = pca_results(Boston_listings_df_scaled, pca_final, 1)
first_PC_top_weights = pd.concat([first_PC.sort_values().head(4), first_PC.sort_values().tail(4)])
first_PC_top_weights.plot.bar(grid=True)
plt.show()
Apparently, there are more private rooms in houses. It makes sense as these are usually larger than other types of accommodation. The acceptance rate is also correlated, and the city Massachusetts.
This is a very good indicator that the previous analysis was on point. The lower the cleaning fee, the number of beds and the number of people it accommodates, the lower the price.
These 2 sets are inversely correlated.
# 3rd principal component (again reusing the `first_PC` variable name).
first_PC = pca_results(Boston_listings_df_scaled, pca_final, 2)
first_PC_top_weights = pd.concat([first_PC.sort_values().head(4), first_PC.sort_values().tail(4)])
first_PC_top_weights.plot.bar(grid=True)
plt.show()
I will use a linear and a polynomial model on the raw data.
def clean_fit_linear_mod(df, response_col):
    """Fit a LinearRegression predicting `response_col` from all other columns.

    Splits 70/30 train/test with a fixed random_state for reproducibility.

    Returns
    -------
    (test_r2, train_r2, fitted_model, X_train, X_test, y_train, y_test)
    """
    # Split into explanatory and response variables
    X = df.drop(response_col, axis=1)
    y = df[response_col]
    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)
    # `normalize=True` was removed from LinearRegression in scikit-learn 1.2;
    # for unregularized OLS it did not change predictions or R^2, so dropping
    # it preserves the scores while keeping the code runnable.
    lm_model = LinearRegression()  # Instantiate
    lm_model.fit(X_train, y_train)  # Fit
    # Predict using the model
    y_test_preds = lm_model.predict(X_test)
    y_train_preds = lm_model.predict(X_train)
    # Score using the model
    test_score = r2_score(y_test, y_test_preds)
    train_score = r2_score(y_train, y_train_preds)
    return test_score, train_score, lm_model, X_train, X_test, y_train, y_test
def clean_fit_polynomial_mod(df, response_col):
    """Fit a degree-2 polynomial regression predicting `response_col`.

    Expands the explanatory variables with PolynomialFeatures(degree=2) before
    a 70/30 train/test split (fixed random_state for reproducibility).

    Returns
    -------
    (test_r2, train_r2, fitted_model, X_train, X_test, y_train, y_test)
    """
    # Split into explanatory and response variables
    X = df.drop(response_col, axis=1)
    y = df[response_col]
    poly = PolynomialFeatures(degree=2)  # Instantiate
    X_poly = poly.fit_transform(X)  # Expand the feature space
    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.3, random_state=43)
    regression = LinearRegression()
    poly_model = regression.fit(X_train, y_train)
    # Predict using the model
    y_test_preds = poly_model.predict(X_test)
    y_train_preds = poly_model.predict(X_train)
    # Score using the model
    test_score = r2_score(y_test, y_test_preds)
    train_score = r2_score(y_train, y_train_preds)
    # Bug fix: return the model fitted here (`poly_model`) — the original
    # returned `lm_model`, an undefined local that silently leaked the global
    # left over from the linear-model cell.
    return test_score, train_score, poly_model, X_train, X_test, y_train, y_test
# Linear model predicting price for both cities; report train/test R^2.
# BOSTON
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = \
    clean_fit_linear_mod(Boston_listings_df, 'price')
print('Boston test score', test_score, 'train score', train_score)
# SEATTLE
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = \
    clean_fit_linear_mod(Seattle_listings_df, 'price')
print('Seattle test score', test_score, 'train score', train_score)
The model does a terrible job predicting the price for both cities.
# Polynomial (degree-2) model predicting price for both cities.
# BOSTON
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = \
    clean_fit_polynomial_mod(Boston_listings_df, 'price')
print('Boston test score', test_score, 'train score', train_score)
# SEATTLE
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = \
    clean_fit_polynomial_mod(Seattle_listings_df, 'price')
print('Seattle test score', test_score, 'train score', train_score)
The model does better with the training data but it is terrible in the testing, worse than with the linear model, even though we tried polynomials of degree 2 and 3 (3 hangs for too long).
Conclusion: We have seen the correlations between price and the rest of the variables, and also explained the different correlations that exist between the variables in the dataset through PCA. We have also tried to predict the price with a linear and a polynomial model, but to no avail. This means that either there is no causation or we do not have enough data. We have also tried to include in the prediction other fields that we had initially excluded, like availability and min and max nights, but the model does not improve whatsoever.
We will perform the same data analysis but with review_scores_rating, and we will also try to predict this value. The data wrangling remains the same.
We are interested in the review score. All the review attributes have more or less the same number of non-null values. Seeing that review_scores_accuracy has the lowest number of non-null values (by a hair), I will drop the rows that contain a null value in that field. This way, there will be a few edge cases where e.g. the accuracy or the check-in or the communication are missing while the rating is not. Luckily, the same happens for Seattle.
# We delete rows with nan review_scores_rating values (the new response
# variable for this section).
# Boston
Boston_listings_df = Boston_listings_df.dropna(subset=['review_scores_rating'], how='any')
# Seattle
Seattle_listings_df = Seattle_listings_df.dropna(subset=['review_scores_rating'], how='any')
# Boston
# Rating vs the main numeric listing attributes, 1000-row sample as before.
g = sns.pairplot(Boston_listings_df.sample(1000, random_state=42), \
vars=['review_scores_rating', 'price', 'host_response_rate', 'host_acceptance_rate', 'accommodates', 'bathrooms', \
'beds', 'bedrooms', 'security_deposit', 'cleaning_fee', 'square_feet', \
'number_of_reviews'],kind="reg")
Looking again only at the first row of figures: We can see that there is a positive correlation between good ratings and price. We did not see this in the identical study we did for price. There is a slight positive correlation with the response rate and with the security deposit. And on the negative side, there is one with the square feet.
# Boston
# Rating vs the other review sub-scores. The original vars list repeated
# 'review_scores_rating' twice, producing a redundant self-vs-self panel;
# the duplicate has been removed.
g = sns.pairplot(Boston_listings_df.sample(1000, random_state=42), \
vars=['review_scores_rating', 'review_scores_cleanliness', 'review_scores_accuracy', 'review_scores_checkin', \
'review_scores_communication', 'review_scores_location', 'review_scores_value'] \
,kind="reg")
It was expected to obtain strong positive correlations with the rest of review fields, and indeed, one can observe these relationships in the figures.
We expect to see the same relationships as in Boston as in Seattle, as there was not much difference between them with the previous analysis from the previous question.
# Seattle
# Rating vs the main numeric listing attributes for Seattle.
g = sns.pairplot(Seattle_listings_df.sample(1000, random_state=40), \
vars=['review_scores_rating', 'price', 'host_response_rate', 'host_acceptance_rate', 'accommodates', 'bathrooms', \
'beds', 'bedrooms', 'security_deposit', 'cleaning_fee', 'square_feet', \
'number_of_reviews'],kind="reg")
# Seattle
# Rating vs the other review sub-scores; the duplicated
# 'review_scores_rating' entry in the original vars list has been removed.
g = sns.pairplot(Seattle_listings_df.sample(1000, random_state=42), \
vars=['review_scores_rating', 'review_scores_cleanliness', 'review_scores_accuracy', 'review_scores_checkin', \
'review_scores_communication', 'review_scores_location', 'review_scores_value'] \
,kind="reg")
Let us see if we do a better job predicting review_scores_rating. Initially, we will not drop the rest of the reviews to see how well it does. Afterwards, we will delete them and see how it performs.
# Linear model predicting review_scores_rating for both cities.
# BOSTON
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = \
    clean_fit_linear_mod(Boston_listings_df, 'review_scores_rating')
print('Boston test score', test_score, 'train score', train_score)
# SEATTLE
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = \
    clean_fit_linear_mod(Seattle_listings_df, 'review_scores_rating')
print('Seattle test score', test_score, 'train score', train_score)
We see that in training it does much better than when predicting price, but in testing it once again does terribly. Given this terrible performance, it is not even worth the time to delete key fields like the rest of the reviews, as it can only perform worse.
# Polynomial model
# Bug fix: this section predicts review_scores_rating (see the linear run
# above) — the original passed 'price' again, a copy-paste slip.
# BOSTON
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = \
    clean_fit_polynomial_mod(Boston_listings_df, 'review_scores_rating')
print('Boston test score', test_score, 'train score', train_score)
# SEATTLE
test_score, train_score, lm_model, X_train, X_test, y_train, y_test = \
    clean_fit_polynomial_mod(Seattle_listings_df, 'review_scores_rating')
print('Seattle test score', test_score, 'train score', train_score)
There is also not much to say about the terrible performance.
My conclusion with respect to prediction is that we need more data to have accurate results.